import pysrt
import pandas as pd
import numpy as np
import re
import nltk
import eli5
import lime
from nltk.corpus import stopwords
# nltk.download("stopwords")
# nltk.download("punkt")
from string import punctuation
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.model_selection import train_test_split
def document_vectorizer(corpus, model, num_features):
    """Encode each tokenized document as the mean of its word vectors.

    Parameters
    ----------
    corpus : iterable of list[str]
        Tokenized documents.
    model : gensim-style word-vector model
        Must expose ``model.wv.index_to_key`` and ``model.wv[word]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray of shape (len(corpus), num_features).  A document with
    no in-vocabulary words maps to the zero vector.
    """
    known_words = set(model.wv.index_to_key)

    def _mean_vector(tokens):
        # Sum the vectors of recognized tokens, then divide by their count.
        total = np.zeros((num_features,), dtype="float64")
        hits = 0.
        for token in tokens:
            if token in known_words:
                hits = hits + 1.
                total = np.add(total, model.wv[token])
        return np.divide(total, hits) if hits else total

    return np.array([_mean_vector(doc) for doc in corpus])
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
# remove_noise function
def remove_noise(text):
    """Strip subtitle markup and stray separator characters from *text*.

    Removes HTML-like tags (e.g. ``<i>``), ``{...}`` styling blocks and
    ``[...]`` sound cues, then replaces hyphens, underscores and
    ampersands with spaces.  Returns the stripped result.
    """
    # Non-greedy ".*?" so "<i>word</i>" keeps "word"; the greedy ".*"
    # version deleted everything between the first "<" and the last ">".
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\{.*?\}", " ", text)
    text = re.sub(r"\[.*?\]", " ", text)
    # dangling dialogue dashes split across a line break
    text = re.sub(r"- \n-", " ", text)
    # hyphens, underscores and ampersands collapsed into one character class
    text = re.sub(r"[-_&]", " ", text)
    return text.strip()
# impurity function
# Characters that typically indicate leftover markup in subtitle text.
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
# RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\"]')
def impurity(text, min_len=10):
    """Return the share of suspicious characters in *text*.

    None or texts shorter than *min_len* are treated as clean (returns 0),
    because ratios computed on very short strings are noisy.
    """
    # "is None" rather than "== None": identity is the correct check for
    # the None singleton and avoids surprises from overridden __eq__.
    if text is None or len(text) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(text))/len(text)
# rest of stuff
# import textacy.preprocessing as tprep
# Load the subtitle file; each cue carries timing info plus the caption text.
subs = pysrt.open("Interstellar.srt")
# Keep only the caption text: one DataFrame row per subtitle cue.
DF = pd.DataFrame([
{
"Text": sub.text
} for sub in subs])
DF.head()
Set up Text
# Strip markup/separators from every caption, then drop rows that became empty.
DF['clean'] = DF['Text'].apply(remove_noise)
DF = DF[DF['clean'] != ""]
# DF.head(10)
DF
# # deal with the errors
# # (earlier, narrower cleanup attempt kept for reference — superseded by remove_noise)
# def clean_errors(text):
# text = re.sub("</?[a-z]+>", "", text)
# text = re.sub("♪", "", text)
# return text
# DF['clean'] = DF['Text'].apply(clean_errors)
Do preprocessing on your text to prepare it for the final machine learning model.
Creating 2 preprocessing process by defining 2 Clean function:
- preprocessing 1: with stemming
- preprocessing 2: without stemming.
# Standard English stop words plus corpus-specific additions: tokenizer
# fragments ("'s", "'ve", ...) and spoken fillers ("um", "uh", ...).
stops = set(stopwords.words("english"))
stops.add("...")
stops.add("'s")
stops.add("'t")
stops.add("'m")
stops.add("'re")
stops.add("'ll")
stops.add("'ve")
stops.add("'d")
stops.add("um")
stops.add("uh")
stops.add("heh")
# Lower-cased: clean_up()/clean_no_stem() lower-case the text before
# filtering, so the capitalized "Whoa" could never have matched.
stops.add("whoa")
stemmer = PorterStemmer()
def clean_up(text):
    """Normalize *text* for modeling: lower-case, drop stop words and
    digit/punctuation tokens, strip quote characters, and stem each word."""
    # lower case everything so it's normalized
    text = text.lower()
    # Normalize curly apostrophes to straight ones so stop words like "'s"
    # match.  (The original re.sub("'", "'", text) replaced the straight
    # apostrophe with itself — a no-op, presumably a mangled curly quote.)
    text = re.sub("’", "'", text)
    # remove stop words
    text = " ".join([word for word in nltk.word_tokenize(text) if word not in stops])
    # get rid of digit and punctuation tokens
    text = " ".join([word for word in nltk.word_tokenize(text) if not word.isdigit() and word not in punctuation])
    # remove leftover quote characters
    text = re.sub("'", "", text)
    text = re.sub("`", "", text)
    # Stem each token: PorterStemmer.stem() operates on a single word, so
    # calling it on the whole string only altered the final word.
    text = " ".join(stemmer.stem(word) for word in text.split())
    # take out the white space
    return text.strip()
def clean_no_stem(text):
    """Normalize *text* exactly like clean_up() but WITHOUT stemming, so
    full words survive (better input for sentiment scoring)."""
    # lower case everything so it's normalized
    text = text.lower()
    # Normalize curly apostrophes to straight ones so stop words like "'s"
    # match.  (The original re.sub("'", "'", text) replaced the straight
    # apostrophe with itself — a no-op, presumably a mangled curly quote.)
    text = re.sub("’", "'", text)
    # remove stop words
    text = " ".join([word for word in nltk.word_tokenize(text) if word not in stops])
    # get rid of digit and punctuation tokens
    text = " ".join([word for word in nltk.word_tokenize(text) if not word.isdigit() and word not in punctuation])
    # remove leftover quote characters
    text = re.sub("'", "", text)
    text = re.sub("`", "", text)
    # no stemming here — full words are kept on purpose
    # take out the white space
    return text.strip()
# Build both preprocessing variants: clean2 = stemmed, clean3 = unstemmed.
DF['clean2'] = DF['clean'].apply(clean_up)
DF['clean3'] = DF['clean'].apply(clean_no_stem)
DF
- So one thing that you can do for the assignment you could you will try is two different preprocessors
- So right now we're going to include both, but later we could try turning it off, turning off the stemming and seeing of our answers gets better.
What items do you think will be important in your preprocessing to clean?
I have applied several functions to clean the text:
- • remove_noise function: this function uses regular expressions to remove noise such as tags, bracketed cues, hyphens, underscores, and newline artifacts. Then I deleted the empty cells from the data.
- • Stopwords from nltk to remove stop words. I also checked the dataframe and added any other stop words that I saw in the data, such as "uh", "'s", etc.
- • Clean_up function that do lower casing, tokenizing, stemming, and removes also some regular expression as well.
so We did :
- regex
- Stemming
- lower casing
- punctuation
- stop words
- among them stemming seems that did not change the results that much.
- for cleaning the text we applied the nltk stop-word list and then, by inspecting the results, added other stop words that we found in the text, such as "'ve" and filler words like "um" and "uh".
PREDICTION OF SENTIMENT
- Use textblob to create sentiment scores.
- Assign every thing above 0 to be positive, everything 0 and below to be negative.
- Use this column as your "label".
- What items do you think will be important in your preprocessing to clean for your specific text? Write a summary of the different considerations you used for determining the cleaning for classification.
- we use "clean1" column to do sentiment or the nouns because sentiment's gonna work on full words. It doesn't work well on very cleaned up data like "clean2".
Create Feature Extractions
- Create our labels
- noun example
- dr b will add this later
sentiment example
- We run 4 models and then check the results to see which one is better.
- We did some preprocessing steps, and those affect the outcome.
- Then feature extraction step, and that's going to affect the outcome.
- So we've got like five or six options that we're just gonna play with.
- If you have a huge data set that you're trying to build a big model on, what I tell people do is randomly sample from it, test a bunch of feature extractions, and it'll be real clear real fast which ones don't work at all.
- we might have 5 here in this example.
- How many were doing 1 hot TFIDF bag of words?
- 2 words Vivek model.
- So we're going to do 5.
- It'll be real obvious like ohh these two are not gonna work.
- So you would get rid of them and then tune the models some more and pick the best one from that and so you can do these sort of you can do a whole bunch of them without taking a lot of computation time and effort, using maybe a random small data set to weed out the ones that are like.
- Nope, this this data and that just doesn't work and you'll see when we when we do it umm, there will probably be at least two that are just don't work at all.
- Maybe like wow, these answers are crap.
- So we would get rid of it.
that is 100% easier to use the count vectorizer and say do the one hot encoding and they they give you roughly the same structure and that's because the the models that we're going to feed this data into, right.
# remove the empty cells
DF = DF[DF['clean2'].str.len() > 1]
# add fake labels (this is just for class)
# do this on clean with full words not totally processed for sentiment
DF['sent_score'] = [TextBlob(text).sentiment.polarity for text in DF['clean'].to_list()]
# # assign a label: positive polarity -> True, zero or negative -> False
DF['sent_label'] = DF['sent_score'] > 0
DF.head()
# Repeat for the no-stemming variant: drop rows whose clean3 text is empty.
DF = DF[DF['clean3'].str.len() > 1]
# add fake labels (this is just for class)
# do this on clean with full words not totally processed for sentiment
# NOTE(review): sent3_score is computed from the same 'clean' column as
# sent_score above, so the two labels are identical for surviving rows —
# confirm this duplication is intended.
DF['sent3_score'] = [TextBlob(text).sentiment.polarity for text in DF['clean'].to_list()]
# # assign a label
DF['sent3_label'] = DF['sent3_score'] > 0
DF.head()
Balancing
# Inspect the class distribution before balancing.
DF['sent_label'].value_counts()
DF['sent3_label'].value_counts()
# DF['sent_label'].value_counts()
# create balanced data by down-sampling each class to the same size
# NOTE(review): n=362 is hard-coded — presumably the minority-class count;
# confirm against the value_counts() output above.
DFB = DF.groupby('sent_label').sample(n = 362)
DFB['sent_label'].value_counts()
# DF['sent_label'].value_counts()
# create balanced data
# NOTE(review): this reassignment overwrites the DFB built on sent_label
# above; all later cells see only the sent3_label-balanced sample.
DFB = DF.groupby('sent3_label').sample(n = 362)
DFB['sent3_label'].value_counts()
The data is imbalanced
Create test-train on Imbalanced and balanced Data
- PreProcessing 1 (with Stemming)
# 80/20 split on the stemmed text (clean2) with the imbalanced labels.
# random_state is fixed so every feature-extraction/model run sees the same split.
X_train, X_test, Y_train, Y_test = train_test_split(DF['clean2'], # X values
DF['sent_label'], # Y values
test_size = 0.2, # test size
random_state = 42)
print('Size of Imbalanced Training Data ', X_train.shape[0])
print('Size of Imbalanced Test Data ', X_test.shape[0])
# Same split on the balanced sample.
# NOTE(review): DFB was last rebuilt on sent3_label; here it is split with
# DFB['sent_label'] — verify this mix is intended.
XB_train, XB_test, YB_train, YB_test = train_test_split(DFB['clean2'], # X values
DFB['sent_label'], # Y values
test_size = 0.2, # test size
random_state = 42)
print('Size of Balanced Training Data ', XB_train.shape[0])
print('Size of Balanced Test Data ', XB_test.shape[0])
Size of Imbalanced Test Data 358
Size of Balanced Training Data 579
Size of Balanced Test Data 145
- PreProcessing 2 (with NO Stemming)
# 80/20 split on the unstemmed text (clean3) with the imbalanced labels;
# the "s" prefix marks the no-stemming variant throughout.
Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(DF['clean3'], # X values
DF['sent3_label'], # Y values
test_size = 0.2, # test size
random_state = 42)
print('Size of Imbalanced Training Data ', Xs_train.shape[0])
print('Size of Imbalanced Test Data ', Xs_test.shape[0])
# Same split on the balanced sample (DFB, built on sent3_label).
XsB_train, XsB_test, YsB_train, YsB_test = train_test_split(DFB['clean3'], # X values
DFB['sent3_label'], # Y values
test_size = 0.2, # test size
random_state = 42)
print('Size of Balanced Training Data ', XsB_train.shape[0])
print('Size of Balanced Test Data ', XsB_test.shape[0])
Size of Imbalanced Test Data 358
Size of Balanced Training Data 579
Size of Balanced Test Data 145
Feature extraction (“one-hot” encoding using the count vectorizer and binary options)
- PreProcessing 1 (with Stemming)
# One-hot encoding: binary=True records word presence/absence instead of counts.
# build a blank setup
oh = CountVectorizer(binary = True)
# fit the training data (the vocabulary is learned from the training split only)
oh_u_train = oh.fit_transform(X_train)
# transform the testing data to look like the training data
oh_u_test = oh.transform(X_test)
print('Stemming - Inbalanced data:')
print(oh_u_train.shape)
print(oh_u_test.shape)
# build a blank setup (fresh vectorizer for the balanced sample)
oh = CountVectorizer(binary = True)
# fit the training data
oh_b_train = oh.fit_transform(XB_train)
# transform the testing data to look like the training data
oh_b_test = oh.transform(XB_test)
print('Stemming - Balanced data:')
print(oh_b_train.shape)
print(oh_b_test.shape)
(1432, 1675)
(358, 1675)
Stemming - Balanced data:
(579, 995)
(145, 995)
- PreProcessing 2 (with NO Stemming)
# One-hot encoding for the no-stemming variant.
# build a blank setup
oh = CountVectorizer(binary = True)
# fit the training data
oh_su_train = oh.fit_transform(Xs_train)
# transform the testing data to look like the training data
oh_su_test = oh.transform(Xs_test)
print('No stemming - Inbalanced data:')
print(oh_su_train.shape)
print(oh_su_test.shape)
# build a blank setup (fresh vectorizer for the balanced sample)
oh = CountVectorizer(binary = True)
# fit the training data
oh_sb_train = oh.fit_transform(XsB_train)
# transform the testing data to look like the training data
oh_sb_test = oh.transform(XsB_test)
print('No stemming - Balanced data:')
print(oh_sb_train.shape)
print(oh_sb_test.shape)
(1432, 1565)
(358, 1565)
No stemming - Balanced data:
(579, 951)
(145, 951)
1675 is the number of unique vocabulary items that it found.
We lost about 700 data point by balancing the dataset. We will see how it will affect the results
Create the "bag of words" encoding using the count vectorizer.
- PreProcessing 1 (with Stemming)
# Bag of words: default CountVectorizer keeps term counts (not just presence).
# build a blank setup
bow = CountVectorizer()
# fit the training data
bow_u_train = bow.fit_transform(X_train)
# transform the testing data to look like the training data
bow_u_test = bow.transform(X_test)
print('With Stemming - Inbalanced data:')
print(bow_u_train.shape)
print(bow_u_test.shape)
# build a blank setup (fresh vectorizer for the balanced sample)
bow = CountVectorizer()
# fit the training data
bow_b_train = bow.fit_transform(XB_train)
# transform the testing data to look like the training data
bow_b_test = bow.transform(XB_test)
print('With Stemming - Balanced data:')
print(bow_b_train.shape)
print(bow_b_test.shape)
(1432, 1675)
(358, 1675)
With Stemming - Balanced data:
(579, 995)
(145, 995)
- PreProcessing 2 (with NO Stemming)
# Bag of words for the no-stemming variant.
# build a blank setup
bow = CountVectorizer()
# fit the training data
bow_su_train = bow.fit_transform(Xs_train)
# transform the testing data to look like the training data
bow_su_test = bow.transform(Xs_test)
print('No Stemming - Inbalanced data:')
print(bow_su_train.shape)
print(bow_su_test.shape)
# build a blank setup (fresh vectorizer for the balanced sample)
bow = CountVectorizer()
# fit the training data
bow_sb_train = bow.fit_transform(XsB_train)
# transform the testing data to look like the training data
bow_sb_test = bow.transform(XsB_test)
print('No Stemming - Balanced data:')
print(bow_sb_train.shape)
print(bow_sb_test.shape)
(1432, 1565)
(358, 1565)
No Stemming - Balanced data:
(579, 951)
(145, 951)
So we should see the same shapes, they out same output shapes because the vocabulary pulling is the same for each of these.
Each of these three methods should give you the same vocabulary outputs here and then we should get the same size outputs because they're the same data sets.
Classify
- Use at least two classification algorithms to predict the outcome of the data.
- Include the model assessment of these predictions for all models.
Log Unbalanced - PreProssecing 1 -With Stemming
## one hot, logistic regression, unbalanced, stemmed features
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(oh_u_train, Y_train) #training features not X, Y_train
y_log = logreg.predict(oh_u_test) #testing features not X
print(classification_report(Y_test, y_log))
## bag of words, logistic regression, unbalanced, stemmed features
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(bow_u_train, Y_train) #training features not X, Y_train
y_log = logreg.predict(bow_u_test) #testing features not X
print(classification_report(Y_test, y_log))
False 0.88 0.99 0.93 284
True 0.95 0.49 0.64 74
accuracy 0.89 358
macro avg 0.91 0.74 0.79 358
weighted avg 0.89 0.89 0.87 358
precision recall f1-score support
False 0.88 0.99 0.93 284
True 0.95 0.49 0.64 74
accuracy 0.89 358
macro avg 0.91 0.74 0.79 358
weighted avg 0.89 0.89 0.87 358
Log Balanced - PreProssecing 1 -With Stemming
## one hot, logistic regression, BALANCED data (original comment said "unbalanced")
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(oh_b_train, YB_train) #training features not X, YB_train
y_log = logreg.predict(oh_b_test) #testing features not X
print(classification_report(YB_test, y_log))
## bag of words, logistic regression, balanced data
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(bow_b_train, YB_train) #training features not X, YB_train
y_log = logreg.predict(bow_b_test) #testing features not X
print(classification_report(YB_test, y_log))
False 0.82 0.87 0.84 82
True 0.81 0.75 0.78 63
accuracy 0.81 145
macro avg 0.81 0.81 0.81 145
weighted avg 0.81 0.81 0.81 145
precision recall f1-score support
False 0.80 0.85 0.83 82
True 0.79 0.73 0.76 63
accuracy 0.80 145
macro avg 0.80 0.79 0.79 145
weighted avg 0.80 0.80 0.80 145
NB Unbalanced - PreProssecing 1 -With Stemming
## one hot, Naive Bayes, unbalanced (original comment said "log")
nb = MultinomialNB()
nb.fit(oh_u_train, Y_train) #training features not X, Y_train
y_nb = nb.predict(oh_u_test) #testing features not X
print(classification_report(Y_test, y_nb))
## bag of words, Naive Bayes, unbalanced
nb = MultinomialNB()
nb.fit(bow_u_train, Y_train) #training features not X, Y_train
y_nb = nb.predict(bow_u_test) #testing features not X
print(classification_report(Y_test, y_nb))
False 0.90 0.99 0.94 284
True 0.94 0.59 0.73 74
accuracy 0.91 358
macro avg 0.92 0.79 0.84 358
weighted avg 0.91 0.91 0.90 358
precision recall f1-score support
False 0.90 0.99 0.94 284
True 0.94 0.59 0.73 74
accuracy 0.91 358
macro avg 0.92 0.79 0.84 358
weighted avg 0.91 0.91 0.90 358
NB Balanced - PreProssecing 1 -With Stemming
# one hot, Naive Bayes, BALANCED data (original comment said "unbalanced")
nb = MultinomialNB()
nb.fit(oh_b_train, YB_train) #training features not X, YB_train
y_nb = nb.predict(oh_b_test) #testing features not X
print(classification_report(YB_test, y_nb))
## bag of words, Naive Bayes, balanced data
nb = MultinomialNB()
nb.fit(bow_b_train, YB_train) #training features not X, YB_train
y_nb = nb.predict(bow_b_test) #testing features not X
print(classification_report(YB_test, y_nb))
False 0.81 0.52 0.64 82
True 0.58 0.84 0.68 63
accuracy 0.66 145
macro avg 0.69 0.68 0.66 145
weighted avg 0.71 0.66 0.66 145
precision recall f1-score support
False 0.81 0.54 0.65 82
True 0.58 0.84 0.69 63
accuracy 0.67 145
macro avg 0.70 0.69 0.67 145
weighted avg 0.71 0.67 0.66 145
Log Unbalanced - PreProssecing 2 - NO Stemming
## one hot, logistic regression, unbalanced, NO-stemming features
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(oh_su_train, Ys_train) #training features not X, Y_train
ys_log = logreg.predict(oh_su_test) #testing features not X
print(classification_report(Ys_test, ys_log))
## bag of words, logistic regression, unbalanced, no-stemming features
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(bow_su_train, Ys_train) #training features not X, Y_train
ys_log = logreg.predict(bow_su_test) #testing features not X
print(classification_report(Ys_test, ys_log))
False 0.88 0.99 0.93 284
True 0.92 0.47 0.62 74
accuracy 0.88 358
macro avg 0.90 0.73 0.78 358
weighted avg 0.89 0.88 0.87 358
precision recall f1-score support
False 0.88 0.99 0.93 284
True 0.92 0.47 0.62 74
accuracy 0.88 358
macro avg 0.90 0.73 0.78 358
weighted avg 0.89 0.88 0.87 358
Log Balanced - PreProssecing 2 - No Stemming
## one hot, logistic regression, BALANCED, no-stemming features (comment said "unbalanced")
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(oh_sb_train, YsB_train) #training features not X, YB_train
ys_log = logreg.predict(oh_sb_test) #testing features not X
print(classification_report(YsB_test, ys_log))
## bag of words, logistic regression, balanced, no-stemming features
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(bow_sb_train, YsB_train) #training features not X, YB_train
ys_log = logreg.predict(bow_sb_test) #testing features not X
print(classification_report(YsB_test, ys_log))
False 0.80 0.88 0.84 82
True 0.82 0.71 0.76 63
accuracy 0.81 145
macro avg 0.81 0.80 0.80 145
weighted avg 0.81 0.81 0.80 145
precision recall f1-score support
False 0.80 0.85 0.83 82
True 0.79 0.73 0.76 63
accuracy 0.80 145
macro avg 0.80 0.79 0.79 145
weighted avg 0.80 0.80 0.80 145
NB Unbalanced - PreProssecing 2 - No Stemming
## one hot, Naive Bayes, unbalanced, no-stemming features
nb = MultinomialNB()
nb.fit(oh_su_train, Ys_train) #training features not X, Y_train
ys_nb = nb.predict(oh_su_test) #testing features not X
print(classification_report(Ys_test, ys_nb))
## bag of words, Naive Bayes, unbalanced, no-stemming features
nb = MultinomialNB()
nb.fit(bow_su_train, Ys_train) #training features not X, Y_train
ys_nb = nb.predict(bow_su_test) #testing features not X
print(classification_report(Ys_test, ys_nb))
False 0.90 0.99 0.94 284
True 0.91 0.58 0.71 74
accuracy 0.90 358
macro avg 0.91 0.78 0.83 358
weighted avg 0.90 0.90 0.89 358
precision recall f1-score support
False 0.90 0.99 0.94 284
True 0.91 0.58 0.71 74
accuracy 0.90 358
macro avg 0.91 0.78 0.83 358
weighted avg 0.90 0.90 0.89 358
NB Balanced - PreProssecing 2 - No Stemming
# one hot, Naive Bayes, BALANCED, no-stemming features (comment said "log unbalanced")
nb = MultinomialNB()
nb.fit(oh_sb_train, YsB_train) #training features not X, YB_train
ys_nb = nb.predict(oh_sb_test) #testing features not X
print(classification_report(YsB_test, ys_nb))
## bag of words, Naive Bayes, balanced, no-stemming features
nb = MultinomialNB()
nb.fit(bow_sb_train, YsB_train) #training features not X, YB_train
ys_nb = nb.predict(bow_sb_test) #testing features not X
print(classification_report(YsB_test, ys_nb))
False 0.81 0.54 0.65 82
True 0.58 0.84 0.69 63
accuracy 0.67 145
macro avg 0.70 0.69 0.67 145
weighted avg 0.71 0.67 0.66 145
precision recall f1-score support
False 0.81 0.57 0.67 82
True 0.60 0.83 0.69 63
accuracy 0.68 145
macro avg 0.70 0.70 0.68 145
weighted avg 0.72 0.68 0.68 145
Interpret
# Display a saved screenshot/figure inline in the notebook.
from IPython.display import Image
Image(filename='image.png')
Summary:
We used here 2 classification methods: Logistic Regression and Naive Bayes, on both the imbalanced and the balanced data set, and with stemming as part of preprocessing versus no stemming. The results show:
-------- balancing the data did not help
- Actually both of Logistic regression and Naive Bayes models did better on Imbalanced data.
- Or, better say that balancing the data did not help! and we need more data point to get the better result. balancing helps at 10:1 ratios and above
-------The type of feature extraction slightly matters
- BOW give us slightly better accuracy in most of the models and preprocessing situations.
------- the algorithm (the classification method) matters, but only slightly
- On the balanced dataset, Logistic Regression gave us higher accuracy with both the stemming and the no-stemming preprocessing
- On the imbalanced dataset, NB gives us a slightly better result, but there is not much of a difference
- Since NB is a simpler model than Logistic Regression, we choose NB
Which model Wins?
-
Imbalanced dataset:
-
feature extraction methods: feature extraction (OH or BOW) do not matter
-
models:NB
-
preprocessing: Stemming does not matter
Balanced dataset:
- feature extraction methods: BOW
- models:Log
- preprocessing: with stemming
So, between similar accuracy for different models we choose the more simple model . Here the winner is NB for imbalanced dataset using One Hot and BOW as feature extraction with No stemming.
# Re-fit the chosen setup so `bow` and `logreg` hold the winning configuration.
# build a blank setup
bow = CountVectorizer()
# fit the training data
bow_su_train = bow.fit_transform(Xs_train)
# transform the testing data to look like the training data
bow_su_test = bow.transform(Xs_test)
# ## one hot log unbalanced
# NOTE(review): the vectorizer above is bag-of-words, but the model below is
# fit/evaluated on the one-hot features (oh_su_*) — confirm which feature set
# was meant to be carried forward here.
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(oh_su_train, Ys_train) #training features not X, Y_train
ys_log = logreg.predict(oh_su_test) #testing features not X
print(classification_report(Ys_test, ys_log))
False 0.88 0.99 0.93 284
True 0.92 0.47 0.62 74
accuracy 0.88 358
macro avg 0.90 0.73 0.78 358
weighted avg 0.89 0.88 0.87 358
- Use eli5 to determine what predicts each category label.
- Interpret the results by writing a paragraph explaining the output from this package.
import eli5
# Show the 10 most positive/negative logistic-regression coefficients,
# mapped back to vocabulary terms.
# NOTE(review): logreg was fit on the one-hot features while the names come
# from the bag-of-words vectorizer — both were fit on Xs_train so the
# vocabularies should match, but verify.
eli5.show_weights(estimator = logreg, top = 10, feature_names = bow.get_feature_names_out())
# Re-fit the Naive Bayes winner on bag-of-words features (used by the
# pipeline/LIME cells below).
# build a blank setup
bow = CountVectorizer()
# fit the training data
bow_su_train = bow.fit_transform(Xs_train)
# transform the testing data to look like the training data
bow_su_test = bow.transform(Xs_test)
## bag of words, Naive Bayes, unbalanced
nb = MultinomialNB()
nb.fit(bow_su_train, Ys_train) #training features not X, Y_train
ys_nb = nb.predict(bow_su_test) #testing features not X
print(classification_report(Ys_test, ys_nb))
False 0.90 0.99 0.94 284
True 0.91 0.58 0.71 74
accuracy 0.90 358
macro avg 0.91 0.78 0.83 358
weighted avg 0.90 0.90 0.89 358
# we need to make a pipeline from sklearn so raw text goes straight
# through the vectorizer into the classifier
pipeline = make_pipeline(bow, nb)
# class probabilities for a positive-sounding sentence: [P(False), P(True)]
pipeline.predict_proba(["Murph is great"])
# we need to make a pipeline from sklearn
pipeline = make_pipeline(bow, nb)
# class probabilities for a negative-sounding sentence
# NOTE(review): "Murf" is likely a typo for "Murph" — the prose below
# discusses "Murph is terrible".
pipeline.predict_proba(["Murf is terrible"])
Interpret
Here we check how our model (here Naive Bayes) classifies a given sentence as positive or negative.
- If we look at "Murf is terrible", it classifies it as False, i.e. a negative sentence, with a probability of 80%
- If we look at "Murph is great", it classifies it as True, i.e. a positive sentence, with a probability of 62%
# then we build the "explainer" which is a blank model that has the class names
# (the sorted unique labels: [False, True])
explainer = LimeTextExplainer(class_names = Ys_train.sort_values().unique())
# and then we apply the pipeline and explainer
# to new or old text
exp = explainer.explain_instance("Murph is terrible", # text
pipeline.predict_proba, # put in the answers from pipeline
num_features=10) # the number of features
# NOTE(review): the explanation above is immediately overwritten — only the
# explanation for dataframe row 20 below is plotted and saved.
exp = explainer.explain_instance(DF['clean3'].iloc[20], # text
pipeline.predict_proba, # put in the answers from pipeline
num_features=10) # the number of features
exp.as_pyplot_figure()
plt.show()
# save the interactive explanation as a standalone HTML page
exp.save_to_file('Interstellar.html')
# show the raw text that was explained
DF['clean3'].iloc[20]
Interpret
Using the explainer, we can check a sentence (a cell of our dataframe) to see whether each word pushes the classification toward positive or negative. For example, in 'set table always set plate', "always" is weighted toward positive with a probability of 13% while the other words are weighted toward negative. The whole sentence is classified as positive with a probability of 84%.